##############################################################################
# Filename: Robustness_DataQuality.R
# Purpose: Produce SI Table A5
##############################################################################

source("Setup.R")

# ChecksPass flag: starts at 1 for every observation and is set to 0 by each
# check below that the observation fails
data_full$ChecksPass <- 1

### Checks

## No Incompletes
# Keep only rows whose CompletionStatus is "Finished", then drop factor levels
# that no longer occur
finished_rows <- which(data_full$CompletionStatus == "Finished")
data_full <- droplevels(data_full[finished_rows, ])

## Age on first page and year of birth on last page
# The confirmed birth year must equal the year implied by the stated age
# (2019 - Age), or that year minus/plus one
data_full$confirmyear <- as.numeric(as.character(data_full$confirmyear))
implied_year <- 2019 - as.numeric(as.character(data_full$Age))
year_matches <- data_full$confirmyear == implied_year |
  data_full$confirmyear == implied_year - 1 |
  data_full$confirmyear == implied_year + 1
# which() ignores NA comparisons, so rows with a missing Age or confirmyear
# are not flagged here
agediscrep <- which(!year_matches)
# data_full[agediscrep,c("Age","confirmyear")]
data_full$ChecksPass[agediscrep] <- 0

## State on first page and state on last page
# Lookup table; the columns State, Abbreviation and Language are used below
statescheck <- read.csv("StatesAndAbbs_forChecks.csv")
lookup_state <- as.character(statescheck$State)
lookup_abbr <- as.character(statescheck$Abbreviation)
lookup_lang <- as.character(statescheck$Language)

# Work on character copies so that comparisons never depend on factor level
# sets, and vectorise the check so an empty data_full, a StateLive value that
# is absent from the lookup, or a missing answer cannot abort the script
state_first <- as.character(data_full[["StateLive"]])
state_last <- as.character(data_full[["confirmstate"]])
resp_lang <- as.character(data_full[["Lang"]])

# 1. Direct match: confirmstate equals the abbreviation listed for StateLive
#    (first lookup row wins if a state is listed more than once)
expected_abbr <- lookup_abbr[match(state_first, lookup_state,
                                   incomparables = NA)]
states_agree <- !is.na(state_last) & !is.na(expected_abbr) &
  state_last == expected_abbr

# 2. Accepted substitutions: "UP" for Delhi and "AP" for Telangana
states_agree <- states_agree |
  (state_first %in% "Delhi" & state_last %in% "UP") |
  (state_first %in% "Telangana" & state_last %in% "AP")

# 3. Language fallback: if the respondent's Lang appears in the lookup, accept
#    when either state answer belongs to a lookup row with that language
for (lang in unique(lookup_lang[!is.na(lookup_lang)])) {
  lang_rows <- which(lookup_lang == lang)
  states_agree <- states_agree |
    (resp_lang %in% lang &
       (state_last %in% lookup_abbr[lang_rows] |
          state_first %in% lookup_state[lang_rows]))
}

# Despite its name, statediscrep is 1 when the answers are consistent and 0
# when they are not; rows with 0 fail the check
statediscrep <- as.integer(states_agree)
# data_full[which(statediscrep==0),c("ResponseId","StateLive","confirmstate","Lang")]
data_full[which(statediscrep == 0), "ChecksPass"] <- 0

## Time taken
# Observations with Duration_minutes below 5 fail the check
rushing <- which(data_full$Duration_minutes < 5)
data_full$ChecksPass[rushing] <- 0

## Participated in NYU research study recently
# Any PriorStudy answer other than "No" fails; which() leaves missing answers
# untouched
prior_participant <- which(!(data_full$PriorStudy == "No"))
data_full$ChecksPass[prior_participant] <- 0

## Source
# Fails when SawFB and SawIG sum to zero
no_source <- which(data_full$SawFB + data_full$SawIG == 0)
data_full$ChecksPass[no_source] <- 0


### Set up dataframes to only have observations that passed checks
data_full <- droplevels(data_full[which(data_full$ChecksPass == 1), ])

# Evaluation sample: the listed completion stages, excluding Article "pl"
eval_stages <- c("Finished", "PolicyopsDone", "NewsConsArea")
data_eval <- data_full[data_full$CompletionStatus %in% eval_stages, ]
data_eval <- droplevels(data_eval[which(data_eval$Article != "pl"), ])

# Opinion sample: the listed completion stages, all articles
op_stages <- c("Finished", "PolicyopsDone")
data_op <- droplevels(data_full[data_full$CompletionStatus %in% op_stages, ])

# Set neutral values for those in treatment groups who didn't write a comment
in_treatment <- data_eval$Article %in% c("DN", "DP")
data_eval$CommentBiased[in_treatment & is.na(data_eval$CommentBiased)] <- 0
data_eval$CommentFeeling[in_treatment & is.na(data_eval$CommentFeeling)] <- 2


### Table A6
# NOTE(review): the file header says this script produces SI Table A5, while
# this section is labelled A6 -- confirm which table number is correct.
#
# Six OLS models share the same right-hand side: ProBJP interacted with
# Article, plus CollegeGrad, NewsDaily and StrongInterestPolitics. The first
# five are fitted on data_eval, the last on data_op.

# Outcome: YesCorrect
mod_correct_checkspassed = lm(YesCorrect ~ ProBJP*Article 
                         + CollegeGrad + NewsDaily + StrongInterestPolitics,
                         data = data_eval)
# Outcome: YesWellWritten
mod_ww_checkspassed = lm(YesWellWritten ~ ProBJP*Article 
                    + CollegeGrad + NewsDaily + StrongInterestPolitics,
                    data = data_eval)
# Outcome: YesRecommend
mod_recm_checkspassed = lm(YesRecommend ~ ProBJP*Article 
                      + CollegeGrad + NewsDaily + StrongInterestPolitics,
                      data = data_eval)
# Outcome: CommentBiased (missing values for DN/DP articles were set to 0 above)
mod_bias_checkspassed = lm(CommentBiased ~ ProBJP*Article 
                      + CollegeGrad + NewsDaily + StrongInterestPolitics, 
                      data = data_eval) 
# Outcome: CommentFeeling coerced to numeric (missing values for DN/DP
# articles were set to 2 above)
mod_feel_checkspassed = lm(as.numeric(CommentFeeling) ~ ProBJP*Article 
                      + CollegeGrad + NewsDaily + StrongInterestPolitics, 
                      data = data_eval) 
# Outcome: op_dem coerced to numeric; fitted on data_op rather than data_eval
# NOTE(review): data_op is not restricted to Article != "pl" the way data_eval
# is. If "pl" rows are present, this model has additional Article
# coefficients and the eight covariate.labels below may not line up -- confirm.
mod_dem_checkspassed = lm(as.numeric(op_dem) ~ ProBJP*Article
                     + CollegeGrad + NewsDaily + StrongInterestPolitics,
                     data = data_op)
# Print all six models in one table. covariate.labels are applied
# positionally, so their order must follow the order of the coefficients
# (main effects first, then the ProBJP x Article interactions).
# stargazer() is not loaded in this file; presumably Setup.R attaches it.
stargazer(mod_correct_checkspassed, mod_ww_checkspassed, mod_recm_checkspassed, 
          mod_bias_checkspassed, mod_feel_checkspassed, 
          mod_dem_checkspassed,
          title="Main results with observations that pass data quality checks",
          align=TRUE,
          covariate.labels=c("Pro-BJP",
                             "Negative Article", "Positive Article",
                             "College Graduate", "Daily News", "Interested in Politics",
                             "Pro-BJP x Negative Article","Pro-BJP x Positive Article"),
          omit.stat=c("LL","ser","f"), no.space=TRUE)